********************************************************************************
* concentration.do
* Purpose: Construct two measures of labor market concentration at the
*          CZ (commuting zone) x NAICS industry x year level.
*
*   1. Standard HHI (Herfindahl-Hirschman Index) based on employment shares.
*      HHI = sum of squared employment shares across firms in the market.
*      Market = NAICS industry x worker CZ (sac_syn).
*
*   2. Generalized HHI (GHHI) based on flow-adjusted employment shares,
*      following Arnold (2025). This measure accounts for worker mobility between
*      markets when computing competitive pressure.
*      GHHI uses a flow-weighted denominator (Appendix B5).
*
* Both measures are used in heterogeneity/robustness regressions to capture
* local labor market competitiveness.
*
* Input : $data/worker_YYYY.dta (from clean_worker.do)
* Outputs:
*   $data/concentration.dta  - Standard HHI by CZ x NAICS x year
*   $data/ghhi.dta           - Generalized HHI by CZ x NAICS x year
*   $data/worker_panel.dta   - Deduplicated single-job worker panel (intermediate)
*   $data/alpha.dta          - Market transition weights for GHHI (intermediate)
*   $data/denominator.dta    - Flow-adjusted market size denominators (intermediate)
********************************************************************************
set more off

//==============================================================================
// PART 1: STANDARD HHI BY EMPLOYMENT
//
// For each year, compute every firm's employment share in each CZ x NAICS
// market, then HHI = sum of squared shares within the market.
// Employment is measured as the number of T4 earnings records (rows with
// non-missing, non-zero t4earn) of a firm in a market, so market size is the
// total number of such records in that market-year.
//
// File paths use forward slashes (accepted by Stata on every platform) and
// are quoted so that a $data directory containing spaces still works.
//==============================================================================
forvalues y = 2001/2017 {

	use "$data/worker_`y'.dta", clear

	* Restrict to records with non-zero earnings, a known commuting zone and a
	* known industry
	drop if mi(sac_syn) | mi(naics) | mi(t4earn) | t4earn == 0

	* Define market as CZ x NAICS industry (IDs are only meaningful within
	* this year's file; the output is keyed on sac_syn/naics/year instead)
	egen market = group(sac_syn naics)

	* Collapse to firm x market x year: the count of non-missing t4earn rows
	* is the firm's employment in that market
	collapse (count) total_emp_estab = t4earn (firstnm) sac_syn naics, by(entid_syn market year)

	* Market-wide total employment and each firm's share of it
	egen 	total_emp_market 		= total(total_emp_estab), 	by(market)
	gen		emp_share 				= total_emp_estab/total_emp_market

	* HHI = sum of squared employment shares (ranges from 1/N to 1)
	egen 	hhi_emp					= total(emp_share^2), 		by(market)

	* Collapse to one row per market (CZ x NAICS x year); hhi_emp is constant
	* within market, so firstnm simply picks that value
	collapse (firstnm) hhi_emp, by(sac_syn naics year)
	save "$data/concentration_`y'", replace
}

* Stack the yearly files into a single panel and remove the yearly scratch files
drop _all
forvalues y = 2001/2017 {
	append using "$data/concentration_`y'", force
	erase "$data/concentration_`y'.dta"
}

save "$data/concentration", replace


//==============================================================================
// PART 2: GENERALIZED (FLOW-ADJUSTED) HHI
//
// The GHHI adjusts the market size denominator using worker transition rates
// between markets. This reflects the degree to which workers from market m
// can substitute across markets, capturing "effective" competitive pressure.
//
// Steps:
//   Step 0: Build a single-job worker panel (dominant job per worker-year)
//   Step 1: Estimate market-to-market transition weights (alpha_m'm)
//   Step 2: Compute flow-adjusted market size denominators per year
//   Step 3: Compute GHHI using flow-adjusted employment shares
//==============================================================================

clear

//------------------------------------------------------------------------------
// PART 2, STEP 0: CREATE SINGLE-JOB WORKER PANEL (worker_panel.dta)
//
// Each worker is assigned to their highest-paying job in each year.
// This avoids double-counting workers who hold multiple jobs.
// Ties in earnings are broken deterministically by the lowest entid_syn.
// The CZ / industry filter is applied AFTER the dominant job is chosen, so a
// worker whose top job lacks sac_syn or naics is dropped for that year rather
// than being reassigned to a lower-paying job.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	* Forward slash + quotes: portable across platforms and safe with spaces
	use "$data/worker_`y'", clear

	* Keep the highest-paying job per worker-year:
	* sort by worker, descending earnings, then firm ID; -duplicates drop-
	* keeps the first row of each worker in that order
	drop if mi(t4earn) | t4earn == 0
	gsort casenum2019 -t4earn entid_syn
	duplicates drop casenum2019, force

	drop if mi(sac_syn) | mi(naics)

	compress
	save "$data/worker_panel_`y'", replace
}

* Stack the yearly files into one panel and remove the yearly scratch files
clear
forvalues y = 2001/2017 {
	append using "$data/worker_panel_`y'", force
	erase "$data/worker_panel_`y'.dta"
}

* Create numeric IDs for all panel dimensions (market, worker, firm, time)
* These are required by the GHHI algorithm
egen market 	= group(naics sac_syn)
egen workerid 	= group(casenum2019)
egen firm		= group(entid_syn)
egen time		= group(year)

keep sac_syn naics t4earn year entid_syn market workerid firm time

compress
save "$data/worker_panel", replace

* Store global variable names for the GHHI computation
global market 	= "market"
global workerid = "workerid"
global firm 	= "firm"
global time 	= "time"


//------------------------------------------------------------------------------
// PART 2, STEP 1: ESTIMATE ALPHA (market transition weights)
//
// For origin market m and destination market m':
//   alpha = (transition_rate(m -> m') / p_m_m) * (N_m / N_m')
// where p_m_m is the share of firm-changing moves out of m that stay in m, and
// N_m, N_m' are pooled (all-year) market sizes.
// This is Equation 10 in the paper (following Jarosch et al. / Schubert et al.).
//
// Interpretation: alpha reflects how easily a worker from market m can be
// replaced by a worker currently in market m'. Higher alpha means market m'
// is a strong substitute labor supply for market m.
//
// NOTE(review): p_m_m (and therefore alpha) is missing for origin markets with
// no within-market firm changes; such markets end up with a zero denominator
// downstream. Confirm this is the intended treatment.
//------------------------------------------------------------------------------
use "$data/worker_panel", clear

* Market size = number of worker-year observations in each market (across all years)
bys market: gen market_size = _N

* Lag variables to identify origin market and firm.
* The (time) sort key is stated explicitly so the lag never depends on a
* sort order left over from an earlier command.
* NOTE(review): the lag is the worker's previous OBSERVED period, which may be
* more than one year earlier if the worker is absent from the panel in between.
bysort workerid (time): gen lagfirm            = firm[_n-1]
by     workerid:        gen origin_market      = market[_n-1]
by     workerid:        gen origin_market_size = market_size[_n-1]

* Drop first observation of each worker (no lagged market available)
drop if lagfirm == .

* Keep only actual job transitions (firm changes)
keep if firm!=lagfirm

gen count = 1   // counter for transition-pair frequency

* Clarify variable names: current market = destination
rename market destination_market
rename market_size destination_market_size

order origin_market destination_market

* Collapse to market-pair level: count transitions and record market sizes
* (sizes are constant within a market, so the mean just carries them through)
collapse (mean) destination_market_size origin_market_size (count) count, by(origin_market destination_market)

* Total number of transitions out of each origin market
bys origin_market: egen total_transitions = total(count)

* Transition rate from origin m to destination m'
gen transition_rate = count/total_transitions

* Relative size of origin vs. destination market
gen relative_size = origin_market_size/destination_market_size

* Within-market transition rate p_m_m (moves within the same market);
* at most one row per origin has destination == origin, so mean() just
* spreads that single value to all rows of the origin market
gen temp = transition_rate if destination_market==origin_market
bys origin_market: egen p_m_m = mean(temp)
drop temp

* Compute alpha: normalized transition rate scaled by relative market size (Equation 10)
gen alpha = (transition_rate/p_m_m)*(relative_size)

keep alpha origin_market destination_market

save "$data/alpha.dta", replace


//------------------------------------------------------------------------------
// PART 2, STEP 2: CONSTRUCT FLOW-ADJUSTED DENOMINATORS (denominator.dta)
//
// For each market m in each year, the denominator is:
//   D_m = sum_{m'} alpha_{m'm} * N_{m't}   (Equation 12)
// where N_{m't} is the size of destination market m' in year t.
// This represents the "effective" supply of labor available to market m.
//
// Notes on the merges below (both keep unmatched rows):
//   - destination markets with no workers in year t have a missing size and
//     contribute 0 to the sum;
//   - origin markets with no usable alpha get a denominator of 0;
//   - destination markets that never appear in alpha produce one row per year
//     with market == . (harmless: it never matches a real market downstream).
//------------------------------------------------------------------------------

* Scratch file for the year's market sizes; Stata deletes it automatically
* when the do-file finishes, so nothing is left behind in $data
tempfile dest_sizes

forvalues y = 2001/2017 {

	use "$data/worker_panel" if year == `y', clear

	rename market origin_market

	* Compute market sizes for this year and keep one row per market
	bys origin_market: gen origin_market_size = _N
	bys origin_market: gen seq = _n
	keep if seq==1
	drop seq workerid firm

	* Save the same market sizes under "destination" names so they can be
	* attached to the destination side of each alpha pair
	preserve
	rename origin_market destination_market
	rename origin_market_size destination_market_size
	save "`dest_sizes'", replace
	restore

	* Merge alpha weights for all origin x destination pairs
	merge 1:m origin_market using "$data/alpha", nogen
	sort origin_market destination_market

	* Merge destination market sizes
	merge m:1 destination_market using "`dest_sizes'", nogen
	sort origin_market destination_market

	* Flow-adjusted market size: weighted sum of destination market sizes (Equation 12)
	gen weighted_employment = alpha*destination_market_size

	* (sum) treats missing products as 0
	collapse (sum) weighted_employment, by(origin_market)

	rename origin_market market
	* weighted_employment is now the GHHI denominator for this market-year

	gen year = `y'

	save "$data/denominator_`y'", replace
}

* Stack the yearly files into one panel and remove the yearly scratch files
drop _all
forvalues y = 2001/2017 {
	append using "$data/denominator_`y'", force
	erase "$data/denominator_`y'.dta"
}

compress
save "$data/denominator", replace


//------------------------------------------------------------------------------
// PART 2, STEP 3: COMPUTE GENERALIZED HHI (ghhi.dta)
//
// GHHI = sum of flow-adjusted squared employment shares:
//   GHHI_m = sum_j (n_jm / D_mt)^2
// where n_jm = number of unique workers at establishment j in market m,
// and D_mt = flow-adjusted denominator from Step 2.
//
// Requires the user-written -gtools- package (gcollapse).
//
// Markets whose denominator is zero or missing (no usable transition weights)
// get a MISSING ghhi_emp rather than a spurious value of 0.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {
	use "$data/worker_panel" if year == `y', clear

	drop if mi(sac_syn) | mi(naics) | mi(t4earn) | t4earn == 0

	* Define establishment = firm x CZ (a firm may operate in multiple CZs)
	egen establishmentid = group(entid_syn sac_syn)

	* Count unique workers per establishment x market in this year.
	* market is part of the by() key so that n_jm is counted within the market
	* the worker actually belongs to; if NAICS is constant within a firm x CZ
	* this is identical to collapsing by establishment alone.
	gcollapse (nunique) workerid (firstnm) sac_syn naics year, by(establishmentid market)

	* Merge flow-adjusted denominator; keep only markets that have one
	merge m:1 market year using "$data/denominator", keep(3) nogen

	* Flow-adjusted employment share for each establishment
	* (missing when the denominator is 0 or missing)
	gen 	emp_share			=	workerid/weighted_employment

	* GHHI = sum of squared flow-adjusted shares within each CZ x NAICS market.
	* The missing option returns missing, not 0, when every share in the
	* market is missing.
	egen	ghhi_emp			=	total(emp_share^2), by(sac_syn naics) missing

	* Collapse to one row per market (CZ x NAICS x year)
	gcollapse (firstnm) ghhi_emp, by(sac_syn naics year)

	compress
	save "$data/ghhi_`y'", replace
}

* Stack the yearly files into one panel and remove the yearly scratch files
drop _all
forvalues y = 2001/2017 {
	append using "$data/ghhi_`y'", force
	erase "$data/ghhi_`y'.dta"
}

compress
save "$data/ghhi", replace
